UCI Adult Data Set

Dataset URL: https://archive.ics.uci.edu/ml/datasets/adult

Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset.



In [1]:

    
import shutil
import math
from datetime import datetime
import multiprocessing

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import data
from tensorflow.python.feature_column import feature_column

print(tf.__version__)









    



/Users/khalidsalama/anaconda/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)






    



1.4.1



In [2]:

    
MODEL_NAME = 'cenus-model-01'

TRAIN_DATA_FILES_PATTERN = 'data/adult.data.csv'
TEST_DATA_FILES_PATTERN = 'data/adult.test.csv'

RESUME_TRAINING = False
PROCESS_FEATURES = True
EXTEND_FEATURE_COLUMNS = True
MULTI_THREADING = True

Define Dataset Metadata



In [3]:

    
HEADER = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
               'marital_status', 'occupation', 'relationship', 'race', 'gender',
               'capital_gain', 'capital_loss', 'hours_per_week',
               'native_country', 'income_bracket']

HEADER_DEFAULTS = [[0], [''], [0], [''], [0], [''], [''], [''], [''], [''],
                       [0], [0], [0], [''], ['']]

NUMERIC_FEATURE_NAMES = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY = {
    'gender': ['Female', 'Male'],
    
    'race': ['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'],
    
    'education': ['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 
                  'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', 
                  '5th-6th', '10th', '1st-4th', 'Preschool', '12th'],
    
    'marital_status': ['Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 
                       'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'],
    
    'relationship': ['Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'],
    
    'workclass': ['Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', 'Local-gov', '?', 
                  'Self-emp-inc', 'Without-pay', 'Never-worked']
}

CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE = {
    'occupation': 50,
    'native_country' : 100
}

CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.keys()) + list(CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE.keys())

FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

TARGET_NAME = 'income_bracket'

TARGET_LABELS = ['<=50K', '>50K']

WEIGHT_COLUMN_NAME = 'fnlwgt'

UNUSED_FEATURE_NAMES = list(set(HEADER) - set(FEATURE_NAMES) - {TARGET_NAME} - {WEIGHT_COLUMN_NAME})


print("Header: {}".format(HEADER))
print("Numeric Features: {}".format(NUMERIC_FEATURE_NAMES))
print("Categorical Features: {}".format(CATEGORICAL_FEATURE_NAMES))
print("Target: {} - labels: {}".format(TARGET_NAME, TARGET_LABELS))
print("Unused Features: {}".format(UNUSED_FEATURE_NAMES))









    



Header: ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_bracket']
Numeric Features: ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
Categorical Features: ['gender', 'race', 'education', 'marital_status', 'relationship', 'workclass', 'occupation', 'native_country']
Target: income_bracket - labels: ['<=50K', '>50K']
Unused Features: []

Load and Analyse Dataset



In [4]:

    
TRAIN_DATA_SIZE = 32561
TEST_DATA_SIZE = 16278

train_data = pd.read_csv(TRAIN_DATA_FILES_PATTERN, header=None, names=HEADER )
train_data.head(10)









    Out[4]:







  
    
      
      age
      workclass
      fnlwgt
      education
      education_num
      marital_status
      occupation
      relationship
      race
      gender
      capital_gain
      capital_loss
      hours_per_week
      native_country
      income_bracket
    
  
  
    
      0
      39
      State-gov
      77516
      Bachelors
      13
      Never-married
      Adm-clerical
      Not-in-family
      White
      Male
      2174
      0
      40
      United-States
      <=50K
    
    
      1
      50
      Self-emp-not-inc
      83311
      Bachelors
      13
      Married-civ-spouse
      Exec-managerial
      Husband
      White
      Male
      0
      0
      13
      United-States
      <=50K
    
    
      2
      38
      Private
      215646
      HS-grad
      9
      Divorced
      Handlers-cleaners
      Not-in-family
      White
      Male
      0
      0
      40
      United-States
      <=50K
    
    
      3
      53
      Private
      234721
      11th
      7
      Married-civ-spouse
      Handlers-cleaners
      Husband
      Black
      Male
      0
      0
      40
      United-States
      <=50K
    
    
      4
      28
      Private
      338409
      Bachelors
      13
      Married-civ-spouse
      Prof-specialty
      Wife
      Black
      Female
      0
      0
      40
      Cuba
      <=50K
    
    
      5
      37
      Private
      284582
      Masters
      14
      Married-civ-spouse
      Exec-managerial
      Wife
      White
      Female
      0
      0
      40
      United-States
      <=50K
    
    
      6
      49
      Private
      160187
      9th
      5
      Married-spouse-absent
      Other-service
      Not-in-family
      Black
      Female
      0
      0
      16
      Jamaica
      <=50K
    
    
      7
      52
      Self-emp-not-inc
      209642
      HS-grad
      9
      Married-civ-spouse
      Exec-managerial
      Husband
      White
      Male
      0
      0
      45
      United-States
      >50K
    
    
      8
      31
      Private
      45781
      Masters
      14
      Never-married
      Prof-specialty
      Not-in-family
      White
      Female
      14084
      0
      50
      United-States
      >50K
    
    
      9
      42
      Private
      159449
      Bachelors
      13
      Married-civ-spouse
      Exec-managerial
      Husband
      White
      Male
      5178
      0
      40
      United-States
      >50K



In [5]:

    
train_data.describe()









    Out[5]:







  
    
      
      age
      fnlwgt
      education_num
      capital_gain
      capital_loss
      hours_per_week
    
  
  
    
      count
      32561.000000
      3.256100e+04
      32561.000000
      32561.000000
      32561.000000
      32561.000000
    
    
      mean
      38.581647
      1.897784e+05
      10.080679
      1077.648844
      87.303830
      40.437456
    
    
      std
      13.640433
      1.055500e+05
      2.572720
      7385.292085
      402.960219
      12.347429
    
    
      min
      17.000000
      1.228500e+04
      1.000000
      0.000000
      0.000000
      1.000000
    
    
      25%
      28.000000
      1.178270e+05
      9.000000
      0.000000
      0.000000
      40.000000
    
    
      50%
      37.000000
      1.783560e+05
      10.000000
      0.000000
      0.000000
      40.000000
    
    
      75%
      48.000000
      2.370510e+05
      12.000000
      0.000000
      0.000000
      45.000000
    
    
      max
      90.000000
      1.484705e+06
      16.000000
      99999.000000
      4356.000000
      99.000000

Compute Scaling Statistics for Numeric Columns



In [6]:

    
means = train_data[NUMERIC_FEATURE_NAMES].mean(axis=0)
stdvs = train_data[NUMERIC_FEATURE_NAMES].std(axis=0)
maxs = train_data[NUMERIC_FEATURE_NAMES].max(axis=0)
mins = train_data[NUMERIC_FEATURE_NAMES].min(axis=0)
df_stats = pd.DataFrame({"mean":means, "stdv":stdvs, "max":maxs, "min":mins})
df_stats.head(15)









    Out[6]:







  
    
      
      max
      mean
      min
      stdv
    
  
  
    
      age
      90
      38.581647
      17
      13.640433
    
    
      education_num
      16
      10.080679
      1
      2.572720
    
    
      capital_gain
      99999
      1077.648844
      0
      7385.292085
    
    
      capital_loss
      4356
      87.303830
      0
      402.960219
    
    
      hours_per_week
      99
      40.437456
      1
      12.347429

Save Scaling Statistics



In [7]:

    
df_stats.to_csv(path_or_buf="data/adult.stats.csv", header=True, index=True)

Define Data Input Function

a. Parsing and preprocessing logic



In [8]:

    
def parse_csv_row(csv_row):
    
    columns = tf.decode_csv(csv_row, record_defaults=HEADER_DEFAULTS)
    features = dict(zip(HEADER, columns))
    
    for column in UNUSED_FEATURE_NAMES:
        features.pop(column)
    
    target = features.pop(TARGET_NAME)

    return features, target

def process_features(features):

    capital_indicator = features['capital_gain'] > features['capital_loss']
    features['capital_indicator'] = tf.cast(capital_indicator, dtype=tf.int32)
    
    return features

b. Data pipeline input function



In [9]:

    
def csv_input_fn(files_name_pattern, mode=tf.estimator.ModeKeys.EVAL, 
                 skip_header_lines=0, 
                 num_epochs=None, 
                 batch_size=200):
    
    shuffle = True if mode == tf.estimator.ModeKeys.TRAIN else False
        
    num_threads = multiprocessing.cpu_count() if MULTI_THREADING else 1
     
    print("")
    print("* data input_fn:")
    print("================")
    print("Input file(s): {}".format(files_name_pattern))
    print("Batch size: {}".format(batch_size))
    print("Epoch Count: {}".format(num_epochs))
    print("Mode: {}".format(mode))
    print("Thread Count: {}".format(num_threads))
    print("Shuffle: {}".format(shuffle))
    print("================")
    print("")

    file_names = tf.matching_files(files_name_pattern)
    dataset = data.TextLineDataset(filenames=file_names)
    
    dataset = dataset.skip(skip_header_lines)
    
    if shuffle:
        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)
    
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda csv_row: parse_csv_row(csv_row), 
                          num_parallel_calls=num_threads)
    
    if PROCESS_FEATURES:
        dataset = dataset.map(lambda features, target: (process_features(features), target), 
                              num_parallel_calls=num_threads)
        
    dataset = dataset.repeat(num_epochs)
    iterator = dataset.make_one_shot_iterator()
    
    features, target = iterator.get_next()
    return features, target



In [10]:

    
features, target = csv_input_fn(files_name_pattern="")
print("Features in CSV: {}".format(list(features.keys())))
print("Target in CSV: {}".format(target))









    



* data input_fn:
================
Input file(s): 
Batch size: 200
Epoch Count: None
Mode: eval
Thread Count: 4
Shuffle: False
================

Features in CSV: ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'capital_indicator']
Target in CSV: Tensor("IteratorGetNext:15", shape=(?,), dtype=string)

Define Feature Columns

a. Load scaling params



In [11]:

    
df_stats = pd.read_csv("data/adult.stats.csv", header=0, index_col=0)
df_stats['feature_name'] = NUMERIC_FEATURE_NAMES
df_stats.head(10)









    Out[11]:







  
    
      
      max
      mean
      min
      stdv
      feature_name
    
  
  
    
      age
      90
      38.581647
      17
      13.640433
      age
    
    
      education_num
      16
      10.080679
      1
      2.572720
      education_num
    
    
      capital_gain
      99999
      1077.648844
      0
      7385.292085
      capital_gain
    
    
      capital_loss
      4356
      87.303830
      0
      402.960219
      capital_loss
    
    
      hours_per_week
      99
      40.437456
      1
      12.347429
      hours_per_week

b. Create feature columns



In [12]:

    
def extend_feature_columns(feature_columns, hparams):
    
    age_buckets = tf.feature_column.bucketized_column(
      feature_columns['age'], boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    
    education_X_occupation = tf.feature_column.crossed_column(
     ['education', 'occupation'], hash_bucket_size=int(1e4))
    
    age_buckets_X_race = tf.feature_column.crossed_column(
     [age_buckets, feature_columns['race']], hash_bucket_size=int(1e4))
    
    native_country_X_occupation = tf.feature_column.crossed_column(
          ['native_country', 'occupation'], hash_bucket_size=int(1e4))
    
    native_country_embedded = tf.feature_column.embedding_column(
          feature_columns['native_country'], dimension=hparams.embedding_size)
    
    occupation_embedded = tf.feature_column.embedding_column(
          feature_columns['occupation'], dimension=hparams.embedding_size)
    
    education_X_occupation_embedded = tf.feature_column.embedding_column(
          education_X_occupation, dimension=hparams.embedding_size)
    
    native_country_X_occupation_embedded = tf.feature_column.embedding_column(
          native_country_X_occupation, dimension=hparams.embedding_size)
    
    
    feature_columns['age_buckets'] = age_buckets
    feature_columns['education_X_occupation'] = education_X_occupation
    feature_columns['age_buckets_X_race'] = age_buckets_X_race
    feature_columns['native_country_X_occupation'] = native_country_X_occupation
    feature_columns['native_country_embedded'] = native_country_embedded
    feature_columns['occupation_embedded'] = occupation_embedded
    feature_columns['education_X_occupation_embedded'] = education_X_occupation_embedded
    feature_columns['native_country_X_occupation_embedded'] = native_country_X_occupation_embedded
    
    return feature_columns

def standard_scaler(x, mean, stdv):
    return (x-mean)/(stdv)

def maxmin_scaler(x, max_value, min_value):
    return (x-min_value)/(max_value-min_value)  

def get_feature_columns(hparams):
    
    
    numeric_columns = {}
    
    for feature_name in NUMERIC_FEATURE_NAMES:

        feature_mean = df_stats[df_stats.feature_name == feature_name]['mean'].values[0]
        feature_stdv = df_stats[df_stats.feature_name == feature_name]['stdv'].values[0]
        normalizer_fn = lambda x: standard_scaler(x, feature_mean, feature_stdv)
        
        numeric_columns[feature_name] = tf.feature_column.numeric_column(feature_name, 
                                                                         normalizer_fn=normalizer_fn
                                                                        )
    CONSTRUCTED_NUMERIC_FEATURES_NAMES = []
    
    if PROCESS_FEATURES:
        for feature_name in CONSTRUCTED_NUMERIC_FEATURES_NAMES:
            numeric_columns[feature_name] = tf.feature_column.numeric_column(feature_name)
    
    categorical_column_with_vocabulary = \
        {item[0]: tf.feature_column.categorical_column_with_vocabulary_list(item[0], item[1])
         for item in CATEGORICAL_FEATURE_NAMES_WITH_VOCABULARY.items()}
        
    CONSTRUCTED_INDICATOR_FEATURES_NAMES = ['capital_indicator']
    
    categorical_column_with_identity = {}
    
    for feature_name in CONSTRUCTED_INDICATOR_FEATURES_NAMES: 
        categorical_column_with_identity[feature_name] = tf.feature_column.categorical_column_with_identity(feature_name, 
                                                                                                              num_buckets=2,
                                                                                                              default_value=0)
    categorical_column_with_hash_bucket = \
        {item[0]: tf.feature_column.categorical_column_with_hash_bucket(item[0], item[1], dtype=tf.string)
         for item in CATEGORICAL_FEATURE_NAMES_WITH_BUCKET_SIZE.items()}
        
    feature_columns = {}

    if numeric_columns is not None:
        feature_columns.update(numeric_columns)

    if categorical_column_with_vocabulary is not None:
        feature_columns.update(categorical_column_with_vocabulary)
        
    if categorical_column_with_identity is not None:
        feature_columns.update(categorical_column_with_identity)
        
    if categorical_column_with_hash_bucket is not None:
        feature_columns.update(categorical_column_with_hash_bucket)
    
    if EXTEND_FEATURE_COLUMNS:
        feature_columns = extend_feature_columns(feature_columns, hparams)
        
    return feature_columns

feature_columns = get_feature_columns(tf.contrib.training.HParams(num_buckets=5,embedding_size=3))
print("Feature Columns: {}".format(feature_columns))









    



Feature Columns: {'age': _NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125cf4b70>), 'education_num': _NumericColumn(key='education_num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x11ecc2950>), 'capital_gain': _NumericColumn(key='capital_gain', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x11ecc2f28>), 'capital_loss': _NumericColumn(key='capital_loss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125d0b950>), 'hours_per_week': _NumericColumn(key='hours_per_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125d0b158>), 'gender': _VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Female', 'Male'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'race': _VocabularyListCategoricalColumn(key='race', vocabulary_list=('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'education': _VocabularyListCategoricalColumn(key='education', vocabulary_list=('Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'marital_status': _VocabularyListCategoricalColumn(key='marital_status', vocabulary_list=('Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'relationship': _VocabularyListCategoricalColumn(key='relationship', vocabulary_list=('Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'workclass': _VocabularyListCategoricalColumn(key='workclass', vocabulary_list=('Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'), dtype=tf.string, default_value=-1, num_oov_buckets=0), 'capital_indicator': _IdentityCategoricalColumn(key='capital_indicator', num_buckets=2, default_value=0), 'occupation': _HashedCategoricalColumn(key='occupation', hash_bucket_size=50, dtype=tf.string), 'native_country': _HashedCategoricalColumn(key='native_country', hash_bucket_size=100, dtype=tf.string), 'age_buckets': _BucketizedColumn(source_column=_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125cf4b70>), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)), 'education_X_occupation': _CrossedColumn(keys=('education', 'occupation'), hash_bucket_size=10000, hash_key=None), 'age_buckets_X_race': _CrossedColumn(keys=(_BucketizedColumn(source_column=_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125cf4b70>), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)), _VocabularyListCategoricalColumn(key='race', vocabulary_list=('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), hash_bucket_size=10000, hash_key=None), 'native_country_X_occupation': _CrossedColumn(keys=('native_country', 'occupation'), hash_bucket_size=10000, hash_key=None), 'native_country_embedded': _EmbeddingColumn(categorical_column=_HashedCategoricalColumn(key='native_country', hash_bucket_size=100, dtype=tf.string), dimension=3, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x125186b00>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True), 'occupation_embedded': _EmbeddingColumn(categorical_column=_HashedCategoricalColumn(key='occupation', hash_bucket_size=50, dtype=tf.string), dimension=3, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x1251b7cc0>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True), 'education_X_occupation_embedded': _EmbeddingColumn(categorical_column=_CrossedColumn(keys=('education', 'occupation'), hash_bucket_size=10000, hash_key=None), dimension=3, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x125cdc940>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True), 'native_country_X_occupation_embedded': _EmbeddingColumn(categorical_column=_CrossedColumn(keys=('native_country', 'occupation'), hash_bucket_size=10000, hash_key=None), dimension=3, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x125ccdd68>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True)}

Define a DNN Estimator Creation Function

a. Get wide and deep feature columns



In [13]:

    
def get_wide_deep_columns():
    
    feature_columns = list(get_feature_columns(hparams).values())
    
    dense_columns = list(
        filter(lambda column: isinstance(column, feature_column._NumericColumn) |
                              isinstance(column, feature_column._EmbeddingColumn),
               feature_columns
        )
    )

    categorical_columns = list(
        filter(lambda column: isinstance(column, feature_column._VocabularyListCategoricalColumn) |
                              isinstance(column, feature_column._IdentityCategoricalColumn) |
                              isinstance(column, feature_column._BucketizedColumn),
                   feature_columns)
    )
    
    sparse_columns = list(
        filter(lambda column: isinstance(column,feature_column._HashedCategoricalColumn) |
                              isinstance(column, feature_column._CrossedColumn),
               feature_columns)
    )

    indicator_columns = list(
            map(lambda column: tf.feature_column.indicator_column(column),
                categorical_columns)
    )
    
    deep_feature_columns = dense_columns + indicator_columns
    wide_feature_columns = categorical_columns + sparse_columns
    
    return wide_feature_columns, deep_feature_columns

b. Define the estimator



In [14]:

    
def create_DNNComb_estimator(run_config, hparams, print_desc=False):
    
    wide_feature_columns, deep_feature_columns = get_wide_deep_columns()
    
    estimator = tf.estimator.DNNLinearCombinedClassifier(
        
        n_classes=len(TARGET_LABELS),
        label_vocabulary=TARGET_LABELS,
        
        
        dnn_feature_columns = deep_feature_columns,
        linear_feature_columns = wide_feature_columns,
        
        weight_column=WEIGHT_COLUMN_NAME,
        
        dnn_hidden_units= hparams.hidden_units,
        
        dnn_optimizer= tf.train.AdamOptimizer(),
        
        dnn_activation_fn= tf.nn.relu,
        
        config= run_config
    )
    
    
    if print_desc:
        print("")
        print("*Estimator Type:")
        print("================")
        print(type(estimator))
        print("")
        print("*deep columns:")
        print("==============")
        print(deep_feature_columns)
        print("")
        print("wide columns:")
        print("=============")
        print(wide_feature_columns)
        print("")
    
    return estimator

6. Run Experiment

a. Set HParam and RunConfig



In [15]:

    
TRAIN_SIZE = TRAIN_DATA_SIZE
NUM_EPOCHS = 100
BATCH_SIZE = 500
EVAL_AFTER_SEC = 60
TOTAL_STEPS = (TRAIN_SIZE/BATCH_SIZE)*NUM_EPOCHS

hparams  = tf.contrib.training.HParams(
    num_epochs = NUM_EPOCHS,
    batch_size = BATCH_SIZE,
    embedding_size = 4,
    hidden_units= [64, 32, 16],
    max_steps = TOTAL_STEPS
)

model_dir = 'trained_models/{}'.format(MODEL_NAME)

run_config = tf.estimator.RunConfig(
    log_step_count_steps=5000,
    tf_random_seed=19830610,
    model_dir=model_dir
)

print(hparams)
print("Model Directory:", run_config.model_dir)
print("")
print("Dataset Size:", TRAIN_SIZE)
print("Batch Size:", BATCH_SIZE)
print("Steps per Epoch:",TRAIN_SIZE/BATCH_SIZE)
print("Total Steps:", TOTAL_STEPS)
print("That is 1 evaluation step after each",EVAL_AFTER_SEC," training seconds")









    



[('batch_size', 500), ('embedding_size', 4), ('hidden_units', [64, 32, 16]), ('max_steps', 6512.2), ('num_epochs', 100)]
Model Directory: trained_models/cenus-model-01

Dataset Size: 32561
Batch Size: 500
Steps per Epoch: 65.122
Total Steps: 6512.2
That is 1 evaluation step after each 60  training seconds

b. Define TrainSpec and EvaluSpec



In [16]:

    
train_spec = tf.estimator.TrainSpec(
    input_fn = lambda: csv_input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode = tf.estimator.ModeKeys.TRAIN,
        num_epochs=hparams.num_epochs,
        batch_size=hparams.batch_size
    ),
    max_steps=hparams.max_steps,
    hooks=None
)

eval_spec = tf.estimator.EvalSpec(
    input_fn = lambda: csv_input_fn(
        TRAIN_DATA_FILES_PATTERN,
        mode=tf.estimator.ModeKeys.EVAL,
        num_epochs=1,
        batch_size=hparams.batch_size,
            
    ),
    throttle_secs = EVAL_AFTER_SEC,
    steps=None
)

c. Run Experiment via train_and_evaluate



In [17]:

    
if not RESUME_TRAINING:
    print("Removing previous artifacts...")
    shutil.rmtree(model_dir, ignore_errors=True)
else:
    print("Resuming training...") 

    
tf.logging.set_verbosity(tf.logging.INFO)

time_start = datetime.utcnow() 
print("Experiment started at {}".format(time_start.strftime("%H:%M:%S")))
print(".......................................") 

estimator = create_DNNComb_estimator(run_config, hparams, True)

tf.estimator.train_and_evaluate(
    estimator=estimator,
    train_spec=train_spec, 
    eval_spec=eval_spec
)

time_end = datetime.utcnow() 
print(".......................................")
print("Experiment finished at {}".format(time_end.strftime("%H:%M:%S")))
print("")
time_elapsed = time_end - time_start
print("Experiment elapsed time: {} seconds".format(time_elapsed.total_seconds()))









    



Removing previous artifacts...
Experiment started at 18:12:10
.......................................
INFO:tensorflow:Using config: {'_model_dir': 'trained_models/cenus-model-01', '_tf_random_seed': 19830610, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11eb7e278>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

*Estimator Type:
================
<class 'tensorflow.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier'>

*deep columns:
==============
[_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125d0ba60>), _NumericColumn(key='education_num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125d20048>), _NumericColumn(key='capital_gain', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125d208c8>), _NumericColumn(key='capital_loss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125d20950>), _NumericColumn(key='hours_per_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125d0be18>), _EmbeddingColumn(categorical_column=_HashedCategoricalColumn(key='native_country', hash_bucket_size=100, dtype=tf.string), dimension=4, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x11ecc0f28>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True), _EmbeddingColumn(categorical_column=_HashedCategoricalColumn(key='occupation', hash_bucket_size=50, dtype=tf.string), dimension=4, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x11ecc30f0>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True), _EmbeddingColumn(categorical_column=_CrossedColumn(keys=('education', 'occupation'), hash_bucket_size=10000, hash_key=None), dimension=4, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x11ecc3588>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True), _EmbeddingColumn(categorical_column=_CrossedColumn(keys=('native_country', 'occupation'), hash_bucket_size=10000, hash_key=None), dimension=4, combiner='mean', initializer=<tensorflow.python.ops.init_ops.TruncatedNormal object at 0x11ecc35c0>, ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None, trainable=True), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Female', 'Male'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='race', vocabulary_list=('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='education', vocabulary_list=('Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='marital_status', vocabulary_list=('Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='relationship', vocabulary_list=('Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='workclass', vocabulary_list=('Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), _IndicatorColumn(categorical_column=_IdentityCategoricalColumn(key='capital_indicator', num_buckets=2, default_value=0)), _IndicatorColumn(categorical_column=_BucketizedColumn(source_column=_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125d0ba60>), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)))]

wide columns:
=============
[_VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Female', 'Male'), dtype=tf.string, default_value=-1, num_oov_buckets=0), _VocabularyListCategoricalColumn(key='race', vocabulary_list=('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'), dtype=tf.string, default_value=-1, num_oov_buckets=0), _VocabularyListCategoricalColumn(key='education', vocabulary_list=('Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college', 'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school', '5th-6th', '10th', '1st-4th', 'Preschool', '12th'), dtype=tf.string, default_value=-1, num_oov_buckets=0), _VocabularyListCategoricalColumn(key='marital_status', vocabulary_list=('Married-civ-spouse', 'Divorced', 'Married-spouse-absent', 'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'), dtype=tf.string, default_value=-1, num_oov_buckets=0), _VocabularyListCategoricalColumn(key='relationship', vocabulary_list=('Husband', 'Not-in-family', 'Wife', 'Own-child', 'Unmarried', 'Other-relative'), dtype=tf.string, default_value=-1, num_oov_buckets=0), _VocabularyListCategoricalColumn(key='workclass', vocabulary_list=('Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'), dtype=tf.string, default_value=-1, num_oov_buckets=0), _IdentityCategoricalColumn(key='capital_indicator', num_buckets=2, default_value=0), _BucketizedColumn(source_column=_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125d0ba60>), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)), _HashedCategoricalColumn(key='occupation', hash_bucket_size=50, dtype=tf.string), _HashedCategoricalColumn(key='native_country', hash_bucket_size=100, dtype=tf.string), _CrossedColumn(keys=('education', 'occupation'), hash_bucket_size=10000, hash_key=None), _CrossedColumn(keys=(_BucketizedColumn(source_column=_NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function get_feature_columns.<locals>.<lambda> at 0x125d0ba60>), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65)), _VocabularyListCategoricalColumn(key='race', vocabulary_list=('Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), hash_bucket_size=10000, hash_key=None), _CrossedColumn(keys=('native_country', 'occupation'), hash_bucket_size=10000, hash_key=None)]

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 60 secs (eval_spec.throttle_secs) or training is finished.

* data input_fn:
================
Input file(s): data/adult.data.csv
Batch size: 500
Epoch Count: 100
Mode: train
Thread Count: 4
Shuffle: True
================

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into trained_models/cenus-model-01/model.ckpt.
INFO:tensorflow:loss = 2.37785e+08, step = 1
INFO:tensorflow:loss = 3.59773e+07, step = 101 (1.883 sec)
INFO:tensorflow:loss = 3.19562e+07, step = 201 (1.039 sec)
INFO:tensorflow:loss = 2.89906e+07, step = 301 (1.040 sec)
INFO:tensorflow:loss = 2.81271e+07, step = 401 (1.094 sec)
INFO:tensorflow:loss = 2.49637e+07, step = 501 (1.129 sec)
INFO:tensorflow:loss = 3.21887e+07, step = 601 (1.025 sec)
INFO:tensorflow:loss = 4.37688e+07, step = 701 (1.130 sec)
INFO:tensorflow:loss = 2.6423e+07, step = 801 (1.094 sec)
INFO:tensorflow:loss = 2.53036e+07, step = 901 (1.024 sec)
INFO:tensorflow:loss = 2.53407e+07, step = 1001 (1.026 sec)
INFO:tensorflow:loss = 2.63034e+07, step = 1101 (1.050 sec)
INFO:tensorflow:loss = 2.89406e+07, step = 1201 (1.023 sec)
INFO:tensorflow:loss = 2.81525e+07, step = 1301 (1.035 sec)
INFO:tensorflow:loss = 2.51829e+07, step = 1401 (1.030 sec)
INFO:tensorflow:loss = 2.23804e+07, step = 1501 (1.018 sec)
INFO:tensorflow:loss = 2.41746e+07, step = 1601 (1.034 sec)
INFO:tensorflow:loss = 2.6517e+07, step = 1701 (1.086 sec)
INFO:tensorflow:loss = 2.77893e+07, step = 1801 (1.016 sec)
INFO:tensorflow:loss = 2.6174e+07, step = 1901 (1.028 sec)
INFO:tensorflow:loss = 2.78344e+07, step = 2001 (1.011 sec)
INFO:tensorflow:loss = 2.4587e+07, step = 2101 (1.028 sec)
INFO:tensorflow:loss = 2.36612e+07, step = 2201 (1.057 sec)
INFO:tensorflow:loss = 3.10125e+07, step = 2301 (1.094 sec)
INFO:tensorflow:loss = 3.13916e+07, step = 2401 (1.095 sec)
INFO:tensorflow:loss = 2.51005e+07, step = 2501 (1.078 sec)
INFO:tensorflow:loss = 2.57023e+07, step = 2601 (1.074 sec)
INFO:tensorflow:loss = 2.26069e+07, step = 2701 (1.025 sec)
INFO:tensorflow:loss = 2.84692e+07, step = 2801 (1.067 sec)
INFO:tensorflow:loss = 2.72092e+07, step = 2901 (1.012 sec)
INFO:tensorflow:loss = 2.60908e+07, step = 3001 (1.033 sec)
INFO:tensorflow:loss = 2.43139e+07, step = 3101 (1.002 sec)
INFO:tensorflow:loss = 2.73729e+07, step = 3201 (1.018 sec)
INFO:tensorflow:loss = 3.24506e+07, step = 3301 (1.021 sec)
INFO:tensorflow:loss = 2.46407e+07, step = 3401 (1.012 sec)
INFO:tensorflow:loss = 2.44178e+07, step = 3501 (0.986 sec)
INFO:tensorflow:loss = 2.13806e+07, step = 3601 (1.080 sec)
INFO:tensorflow:loss = 2.66112e+07, step = 3701 (1.094 sec)
INFO:tensorflow:loss = 2.08171e+07, step = 3801 (0.892 sec)
INFO:tensorflow:loss = 2.30155e+07, step = 3901 (1.159 sec)
INFO:tensorflow:loss = 2.60285e+07, step = 4001 (0.930 sec)
INFO:tensorflow:Saving checkpoints for 4079 into trained_models/cenus-model-01/model.ckpt.
INFO:tensorflow:Loss for final step: 2.18222e+07.

* data input_fn:
================
Input file(s): data/adult.data.csv
Batch size: 500
Epoch Count: 1
Mode: eval
Thread Count: 4
Shuffle: False
================

WARNING:tensorflow:Casting <dtype: 'float32'> labels to bool.
WARNING:tensorflow:Casting <dtype: 'float32'> labels to bool.
INFO:tensorflow:Starting evaluation at 2018-03-02-18:13:27
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-01/model.ckpt-4079
INFO:tensorflow:Finished evaluation at 2018-03-02-18:13:30
INFO:tensorflow:Saving dict for global step 4079: accuracy = 0.881621, accuracy_baseline = 0.761441, auc = 0.9411, auc_precision_recall = 0.838418, average_loss = 0.259436, global_step = 4079, label/mean = 0.238559, loss = 2.42901e+07, prediction/mean = 0.234475

* data input_fn:
================
Input file(s): data/adult.data.csv
Batch size: 500
Epoch Count: 100
Mode: train
Thread Count: 4
Shuffle: True
================

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-01/model.ckpt-4079
INFO:tensorflow:Saving checkpoints for 4080 into trained_models/cenus-model-01/model.ckpt.
INFO:tensorflow:loss = 2.46946e+07, step = 4080
INFO:tensorflow:loss = 2.73342e+07, step = 4180 (1.307 sec)
INFO:tensorflow:loss = 2.5431e+07, step = 4280 (0.704 sec)
INFO:tensorflow:loss = 2.53493e+07, step = 4380 (0.686 sec)
INFO:tensorflow:loss = 2.35129e+07, step = 4480 (0.691 sec)
INFO:tensorflow:loss = 2.19496e+07, step = 4580 (0.689 sec)
INFO:tensorflow:loss = 2.51637e+07, step = 4680 (0.689 sec)
INFO:tensorflow:loss = 3.13484e+07, step = 4780 (0.682 sec)
INFO:tensorflow:loss = 2.14154e+07, step = 4880 (0.682 sec)
INFO:tensorflow:loss = 2.11767e+07, step = 4980 (0.689 sec)
INFO:tensorflow:loss = 2.12241e+07, step = 5080 (0.702 sec)
INFO:tensorflow:loss = 2.14688e+07, step = 5180 (0.685 sec)
INFO:tensorflow:loss = 2.48392e+07, step = 5280 (0.685 sec)
INFO:tensorflow:loss = 2.55624e+07, step = 5380 (0.683 sec)
INFO:tensorflow:loss = 2.21063e+07, step = 5480 (0.676 sec)
INFO:tensorflow:loss = 2.13232e+07, step = 5580 (0.710 sec)
INFO:tensorflow:loss = 2.04125e+07, step = 5680 (0.728 sec)
INFO:tensorflow:loss = 2.37316e+07, step = 5780 (0.702 sec)
INFO:tensorflow:loss = 2.20543e+07, step = 5880 (0.691 sec)
INFO:tensorflow:loss = 2.34455e+07, step = 5980 (0.687 sec)
INFO:tensorflow:loss = 2.60935e+07, step = 6080 (0.688 sec)
INFO:tensorflow:loss = 2.16544e+07, step = 6180 (0.681 sec)
INFO:tensorflow:loss = 2.19973e+07, step = 6280 (0.677 sec)
INFO:tensorflow:loss = 2.62537e+07, step = 6380 (0.750 sec)
INFO:tensorflow:loss = 2.71262e+07, step = 6480 (0.742 sec)
INFO:tensorflow:Saving checkpoints for 6513 into trained_models/cenus-model-01/model.ckpt.
INFO:tensorflow:Loss for final step: 2.38859e+07.

* data input_fn:
================
Input file(s): data/adult.data.csv
Batch size: 500
Epoch Count: 1
Mode: eval
Thread Count: 4
Shuffle: False
================

WARNING:tensorflow:Casting <dtype: 'float32'> labels to bool.
WARNING:tensorflow:Casting <dtype: 'float32'> labels to bool.
INFO:tensorflow:Starting evaluation at 2018-03-02-18:14:17
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-01/model.ckpt-6513
INFO:tensorflow:Finished evaluation at 2018-03-02-18:14:20
INFO:tensorflow:Saving dict for global step 6513: accuracy = 0.892401, accuracy_baseline = 0.761441, auc = 0.951385, auc_precision_recall = 0.864894, average_loss = 0.23772, global_step = 6513, label/mean = 0.238559, loss = 2.2257e+07, prediction/mean = 0.234463
.......................................
Experiment finished at 18:14:20

Experiment elapsed time: 130.234232 seconds

Evaluate the Model



In [18]:

    
TRAIN_SIZE = TRAIN_DATA_SIZE
TEST_SIZE = TEST_DATA_SIZE

train_input_fn = lambda: csv_input_fn(files_name_pattern= TRAIN_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TRAIN_SIZE)

test_input_fn = lambda: csv_input_fn(files_name_pattern= TEST_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.EVAL,
                                      batch_size= TEST_SIZE)

estimator = create_DNNComb_estimator(run_config, hparams)

train_results = estimator.evaluate(input_fn=train_input_fn, steps=1)
print()
print("######################################################################################")
print("# Train Measures: {}".format(train_results))
print("######################################################################################")

test_results = estimator.evaluate(input_fn=test_input_fn, steps=1)
print()
print("######################################################################################")
print("# Test Measures: {}".format(test_results))
print("######################################################################################")









    



INFO:tensorflow:Using config: {'_model_dir': 'trained_models/cenus-model-01', '_tf_random_seed': 19830610, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 5000, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11eb7e278>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

* data input_fn:
================
Input file(s): data/adult.data.csv
Batch size: 32561
Epoch Count: None
Mode: eval
Thread Count: 4
Shuffle: False
================

WARNING:tensorflow:Casting <dtype: 'float32'> labels to bool.
WARNING:tensorflow:Casting <dtype: 'float32'> labels to bool.
INFO:tensorflow:Starting evaluation at 2018-03-02-18:14:24
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-01/model.ckpt-6513
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2018-03-02-18:14:27
INFO:tensorflow:Saving dict for global step 6513: accuracy = 0.892401, accuracy_baseline = 0.761441, auc = 0.951385, auc_precision_recall = 0.864894, average_loss = 0.237719, global_step = 6513, label/mean = 0.238559, loss = 1.46895e+09, prediction/mean = 0.234462

######################################################################################
# Train Measures: {'accuracy': 0.89240098, 'accuracy_baseline': 0.76144063, 'auc': 0.9513855, 'auc_precision_recall': 0.86489427, 'average_loss': 0.23771885, 'label/mean': 0.23855937, 'loss': 1.4689537e+09, 'prediction/mean': 0.2344622, 'global_step': 6513}
######################################################################################

* data input_fn:
================
Input file(s): data/adult.test.csv
Batch size: 16278
Epoch Count: None
Mode: eval
Thread Count: 4
Shuffle: False
================

WARNING:tensorflow:Casting <dtype: 'float32'> labels to bool.
WARNING:tensorflow:Casting <dtype: 'float32'> labels to bool.
INFO:tensorflow:Starting evaluation at 2018-03-02-18:14:30
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-01/model.ckpt-6513
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2018-03-02-18:14:32
INFO:tensorflow:Saving dict for global step 6513: accuracy = 0.850576, accuracy_baseline = 0.763804, auc = 0.896857, auc_precision_recall = 0.754346, average_loss = 0.356806, global_step = 6513, label/mean = 0.236196, loss = 1.10023e+09, prediction/mean = 0.231706

######################################################################################
# Test Measures: {'accuracy': 0.85057604, 'accuracy_baseline': 0.76380354, 'auc': 0.89685661, 'auc_precision_recall': 0.75434619, 'average_loss': 0.35680604, 'label/mean': 0.23619646, 'loss': 1.1002327e+09, 'prediction/mean': 0.23170602, 'global_step': 6513}
######################################################################################

Prediction



In [19]:

    
import itertools

predict_input_fn = lambda: csv_input_fn(TEST_DATA_FILES_PATTERN, 
                                      mode= tf.estimator.ModeKeys.PREDICT,
                                      batch_size= 10)


predictions = list(itertools.islice(estimator.predict(input_fn=predict_input_fn),10))

print("")
print("* Predicted Classes: {}".format(list(map(lambda item: item["class_ids"][0]
    ,predictions))))

print("* Predicted Probabilities: {}".format(list(map(lambda item: list(item["probabilities"])
    ,predictions))))









    



* data input_fn:
================
Input file(s): data/adult.test.csv
Batch size: 10
Epoch Count: None
Mode: infer
Thread Count: 4
Shuffle: False
================

WARNING:tensorflow:Input graph does not contain a QueueRunner. That means predict yields forever. This is probably a mistake.
INFO:tensorflow:Restoring parameters from trained_models/cenus-model-01/model.ckpt-6513

* Predicted Classes: [0, 0, 0, 1, 0, 0, 0, 1, 0, 0]
* Predicted Probabilities: [[0.99998593, 1.4065748e-05], [0.99092144, 0.009078579], [0.70425093, 0.29574913], [0.056977611, 0.94302231], [0.99998963, 1.0324979e-05], [0.99997151, 2.8453556e-05], [0.99982697, 0.00017310471], [0.40415996, 0.59583998], [0.99310654, 0.0068934429], [0.99930429, 0.00069568807]]

	age	workclass	fnlwgt	education	education_num	marital_status	occupation	relationship	race	gender	capital_gain	hours_per_week	native_country	income_bracket
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K
5	37	Private	284582	Masters	14	Married-civ-spouse	Exec-managerial	Wife	White	Female	0	40	United-States	<=50K
6	49	Private	160187	9th	5	Married-spouse-absent	Other-service	Not-in-family	Black	Female	0	16	Jamaica	<=50K
7	52	Self-emp-not-inc	209642	HS-grad	9	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	45	United-States	>50K
8	31	Private	45781	Masters	14	Never-married	Prof-specialty	Not-in-family	White	Female	14084	50	United-States	>50K
9	42	Private	159449	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	5178	40	United-States	>50K

	age	fnlwgt	education_num	capital_gain	capital_loss	hours_per_week
count	32561.000000	3.256100e+04	32561.000000	32561.000000	32561.000000	32561.000000
mean	38.581647	1.897784e+05	10.080679	1077.648844	87.303830	40.437456
std	13.640433	1.055500e+05	2.572720	7385.292085	402.960219	12.347429
min	17.000000	1.228500e+04	1.000000	0.000000	0.000000	1.000000
25%	28.000000	1.178270e+05	9.000000	0.000000	0.000000	40.000000
50%	37.000000	1.783560e+05	10.000000	0.000000	0.000000	40.000000
75%	48.000000	2.370510e+05	12.000000	0.000000	0.000000	45.000000
max	90.000000	1.484705e+06	16.000000	99999.000000	4356.000000	99.000000

	max	mean	min	stdv
age	90	38.581647	17	13.640433
education_num	16	10.080679	1	2.572720
capital_gain	99999	1077.648844	0	7385.292085
capital_loss	4356	87.303830	0	402.960219
hours_per_week	99	40.437456	1	12.347429